{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sys.version_info(major=3, minor=6, micro=5, releaselevel='final', serial=0)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "pd.set_option('display.max_rows', 500)\n",
    "pd.set_option('display.max_columns', 100)\n",
    "\n",
    "from itertools import product\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "from xgboost import XGBRegressor\n",
    "from xgboost import plot_importance\n",
    "\n",
    "def plot_features(booster, figsize):\n",
    "    fig, ax = plt.subplots(1,1,figsize=figsize)\n",
    "    return plot_importance(booster=booster, ax=ax)\n",
    "\n",
    "import time\n",
    "import sys\n",
    "import gc\n",
    "import pickle\n",
    "sys.version_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "items = pd.read_csv('D:/GIT/kaggle_develop/predictSales/input/items.csv')\n",
    "shops = pd.read_csv('D:/GIT/kaggle_develop/predictSales/input/shops.csv')\n",
    "cats = pd.read_csv('D:/GIT/kaggle_develop/predictSales/input/item_categories.csv')\n",
    "train = pd.read_csv('D:/GIT/kaggle_develop/predictSales/input/sales_train_v2.csv')\n",
    "# set index to ID to avoid droping it later\n",
    "test  = pd.read_csv('D:/GIT/kaggle_develop/predictSales/input/test.csv').set_index('ID')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0xb39f550>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkcAAAEFCAYAAADpO0MNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAD3lJREFUeJzt3XuQnXV9x/HPJuESxrTiTAZq60IF/UV60YKKVhGqNFasjQPVWqXTkUmdtkyttRer462OztQpOoK21bEiHaoOVqKrM1hXhqr04gUFLxPzE7GSqRSbVtTQxEiS7R/n7PIN7oFc9myS3ddrhuHsOc95Lr99dvPe5zznPBMzMzMBAGBgxeFeAQCAI4k4AgAoxBEAQCGOAAAKcQQAUKy6vwe3bdu+IG9lO/HEE3LXXTsWYlbcD+O8OIzz4jDOi8M4Lw7jvDjWrl0zsRDzWZQjR6tWrVyMxSx7xnlxGOfFYZwXh3FeHMb56OJlNQCAQhwBABTiCACgEEcAAIU4AgAoxBEAQCGOAAAKcQQAUIgjAIBCHAEAFOIIAKAQRwAAhTgCACjEEQBAIY4AAApxBABQiCMAgEIcAQAU4ggAoFg17gW8//3vyerVx+ZZz3rOuBcFAHDIxn7kaHr6o5mamhr3YgAAFoSX1QAACnEEAFCIIwCAQhwBABTiCACgEEcAAIU4AgAoxBEAQCGOAAAKcQQAUIgjAIBCHAEAFOIIAKAQRwAAhTgCACjEEQBAIY4AAApxBABQiCMAgEIcAQAU4ggAoBBHAACFOAIAKMQRAEAhjgAACnEEAFCIIwCAQhwBABTiCACgEEcAAIU4AgAoxBEAQCGOAAAKcQQAUIgjAIBCHAEAFOIIAKAQRwAAhTgCACjEEQBAIY4AAApxBABQrBr3Avbu3TvuRQAALBhHjgAACnEEAFCIIwCAQhwBABTiCACgEEcAAIU4AgAoxBEAQCGOAAAKcQQAUIgjAIBCHAEAFOIIAKAQRwAAhTgCACjEEQBAIY4AAApxBABQiCMAgEIcAQAU4ggAoBBHAACFOAIAKMQRAEAhjgAACnEEAFCIIwCAQhwBABTiCACgEEcAAIU4AgAoxBEAQCGOAAAKcQQAUIgjAIBCHAEAFOIIAKAQRwAAhTgCAChWLfQMt2zZnK1bv5nJyVNz+eV/NXf/JZc8f0GXs2LFikxMTGTPnj2ZmJjIqlWrcs899yRJTjvtEdm5c0fuvnt7Hv7w07N+/QVJks997tNJkrvu+k5OPPEhedzjnjC3rlu3fjPf/vadOemkk+eWMTl56tztdevOmNu+JNm69ZtJkvXrL8j09HX7TL9u3Rlz080+b5Tp6evmljs5eeoDTl/VZVx99ZW5885v5bjjVmfdujPmtrl+Pw5k3vMtZ9Z957O/27oUbNmyOXfeeUJOPvnUw70qcFRbTr83OPoseBxNTV2brVtvz+TkKdm1a9dCz37O3r17527PzMzMhVGS3HbbrXO3b7nlC9m5c2eS5NZb+9xzV6xYkTvu+Nbcum7dent27fpBjjvu+LnnTk6eMnd79gd4auraJMnWrbcnGcTR1NSmfaZft+6Mueke6Ad/amrT3HInJ085oF8UdRmf/OQNc2PS+5a5OKrfj4P9JTS7nFn3nc/+butSMDV1bY45ZmVe+tJXHO5VgaPacvq9wdFnQeNoy5bN6f2rSTL3/yPBfOuyd+/eedd1584d8z5v9q+c+87riisum3vO7GPT09fN3d6yZfPIH/7p6evmnrtz5470/tX7nb6qY33FFZftE4s7d+7I9PR1mZw8dZ9t3N95j1pOva8eSdufbV0KltO2wjj5WeJIt6DnHN33CMNSMjV17bzbd8stX5hn2k37PG/0PDfNc9/+jWGdbtQ63HdeB/P9me859b5Rt5ei5bStME5+ljjSOSEbAKBY0DjasOGihZzdEWXDhovm3b7HPObMeaa9cJ/njZ7nhfPct39jWKcbtQ73ndfBfH/me069b9TtpWg5bSuMk58ljnQLes7RunVnpLVHzZ0AfKScd9Tao5L86AnZj3hEO+ATsmfnNXtC9otf/Ce59NKN+0y/fv0Fufnmz+/zvPnMnsx9MCdkz4717Dps3Hjx3HlHq1efMHdCdv1+HMzr+nU59b75Hl/q5w3Mbusxx6xc8tsK47Scfm9wdFrwd6tt2HDRPm/lH9c71hb7rfx1+5J738o/uO/CH5l+/48AXbjPW/kPRF3Guec+dZ+38tdpZrfxYD3Qtiynv/w2bLgoD37wCYd7NeCot5x+b3D0mZiZmRn54LZt20c/uJ9mP9/oyivfe6iz4gGsXbsm27ZtP9yrseQZ58VhnBeHcV4cxnlxrF27ZmIh5uOEbACAQhwBABTiCACgEEcAAIU4AgAoxBEAQCGOAAAKcQQAUIgjAIBCHAEAFOIIAKAQRwAAhTgCACjEEQBAIY4AAApxBABQiCMAgEIcAQAU4ggAoBBHAACFOAIAKMQRAEAhjgAACnEEAFCIIwCAQhwBABTiCACgEEcAAIU4AgAoxBEAQCGOAAAKcQQAUIgjAIBCHAEAFOIIAKAQRwAAhTgCACjEEQBAIY4AAApxBABQrBr3Alas0F8AwNFDuQAAFOIIAKAQRwAAhTgCACjEEQBAIY4AAApxBABQiCMAgEIcAQAU4ggAoBBHAACFOAIAKMQRAEAhjgAACnEEAFCIIwCAQhwBABTiCACgEEcAAIU4AgAoxBEAQCGOAAAKcQQAUIgjAIBCHAEAFOIIAKAQRwAAhTgCACjEEQBAIY4AAApxBABQiCMAgEIcAQAU4ggAoBBHAACFOAIAKMQRAEAhjgAACnEEAFCIIwCAQhwBABSrxr2A9eufkdWrjx33YgAAFsTY4+i5z31B1q5dk23bto97UQAAh8zLagAAhTgCACjEEQBAIY4AAApxBABQiCMAgEIcAQAU4ggAoBBHAACFOAIAKMQRAEAhjgAACnEEAFCIIwCAQhwBABTiCACgEEcAAIU4AgAoxBEAQCGOAACKiZmZmcO9DgAARwxHjgAACnEEAFCIIwCAQhwBABTiCACgEEcAAIU4AgAoVo1z5q21FUn+Jsmjk+xKsrH3/vVxLnOpa63dnOR7wy//I8k7klyeZHeS6d77Xxj3g9daOzvJG3vv57XWTk9yVZKZJF9JcmnvfW9r7TVJnpnBmL+k9/7ZUdMejm04GtxnnM9M8pEktw4f/tve+zXG+eC11o5JcmWSU5Mcl+T1STbH/rygRozzf8b+vOBaayuTvDNJS7InyQuTTGRM+/S4jxw9O8nxvfcnJvnzJG8a8/KWtNba8UnSez9v+N8Lk7w9yfOTPDnJ2cN/aIz7QWit/VmSv0ty/PCuNyd5Ze/9nAx+CDcMx/fcJGcneV6Svx417WKu+9FknnE+M8mby359jXE+ZBcn+d/hOD0jydtifx6H+cbZ/jwez0qS3vuTkrw6g7Eb2z497jh6cpJ/SpLe+6eTPHbMy1vqHp3khNbadGvthtbaU5Ic13u/rfc+k+RjSZ4W436wbktyYfn6rCSfHN7+aJLzMxjb6d77TO99a5JVrbW1I6ZlfvON8zNba59qrb2rtbYmxvlQ/WOSV5Wvd8f+PA6jxtn+vMB67x9K8qLhl6ck+XbGuE+PO45+LPe+BJQke1prY30pb4nbkeSyJE9P8rtJ3j28b9b2JD8e435Qeu/XJrmn3DUxjM5k9NjO3j/ftMxjnnH+bJI/7b0/Jck3krwmxvmQ9N7v7r1vH/7D/IEkr4z9ecGNGGf785j03ne31v4+yVszGO+x7dPjjqPvJ1lTl9d73z3mZS5lX0vyD8Mi/loGO8BDyuNrknw3xn2h1NejR43t7P3zTcv++WDv/fOzt5P8QozzIWutPSzJPye5uvf+3tifx2KecbY/j1Hv/beTPDKD849Wl4cWdJ8edxz9a5ILkqS19oQkXx7z8pa6SzI8f6i19tAkJyT5v9baaa21iQyOKN0Y475Qbm6tnTe8/YzcO7ZPb62taK1NZhCe/zNiWvbPx1prjx/eflqSz8c4H5LW2klJppO8rPd+5fBu+/MCGzHO9ucxaK39Vmvt5cMvd2QQOzeNa58e90stH0zyy621f8vgBKgXjnl5S927klzVWvuXDM64vySDHeQ9SVZm8DrrZ1prn4txXwh/nOSdrbVjk3w1yQd673taazcm+fcM/ri4dNS0h2OFj1K/l+RtrbUfJrkzyYt67983zofkFUlOTPKq1trsOTF/mOQK+/OCmm+cX5rkLfbnBbcpybtba59KckySl2QwZmP5HT0xMzNzf48DACwrPgQSAKAQRwAAhTgCACjEEQBAIY4AAAqfmgzLUGvtsRl8yvpnktzde3/fYV6lJMnw82Eu6r2/7ACec1WST/TerxrXegHLiyNHsAz13m/qvW9M8qQMriZ+pDgjyUmHeyWA5c3nHMEyNPyk2NdnECN3J/mdJLckeUeSh2Xw4aIv771f31p7bZLJDD6yf22SN2Twyb9nJ/likueVaxbNt6w/yuAo1Z4kH+m9v2x4tOd7GVwM8ieTvC6DD439UpIHJXlT7/0NI+Y3kcEnxf9qkjsy+ADUd/Xer2qtza7bQ4aP/cZwuqf23l8wfP5rk+zsvb/xQMYMWD4cOYLla3eSDyd5de/9Y0kuT3Jl7/2sJL+W5B3DC2omyc8lOS+Dq2K/O8kbk/xskjOT/PyoBbTWHpfk95M8fjjdWa21s4YPPyzJOcNlXdZ7/26SVyf58KgwGroog+tV/UyS5yQ5fbis05OsS/KLvfdHJtma5OIk1yQ5v2zLbya5+oEGB1i+xBEw6/wkr2ut3ZLkoxl8RP9pw8c+Prx48e1J/qv3vnn49bcyuHzCKOdmcLToe7333b3388tFOaeHR5y+kn0voPxAzkuyqfd+T+99W5LrkqT3/vUMLhGwsbX2piRPTPKg3vvdw2kubK2dk+Qbvfc7DmB5wDLjhGxg1soMXn76TpK01n4iyX8neXaSH5bpdh/APO/J4DqAGc7zoRlcNDJJfpAkvfeZ1tqBrOdMBtcM3Gd9hkek3pfkzRlcN2lPme7KJK9M8o0kVx3IwoDlx5EjWN52594/km7I4CWwtNbOyOCIzgmHOP8bk1zQWntQa21VBvHy2P1cn1GuT/Lc1tpxrbUTk/zK8P5zM3jX2tuTfC2Dc41WJknv/cYkP5Xkl5J86GA3BlgexBEsb9cneUVr7deT/EGSJ7TWvpTBeToX9963H8rMe+9fSPK2DK6Q/cUkn+q9X38/T/nscB3+8n7mOZXkExnE24eTbB4+dE2SR7fWvjx8/KYkP12euinJDb33XQe1McCy4d1qwJI2fHfbsUk+nuQlw2ADGMk5R8Ahaa2dluTaEQ9v7L3fdBDzPCfJW0c8fMEBnlB9cgZHl94pjID94cgRAEDhnCMAgEIcAQAU4ggAoBBHAACFOAIAKP4fxtCzxdSpUj8AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 720x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.figure(figsize=(10,4))\n",
    "plt.xlim(-100, 3000)\n",
    "sns.boxplot(x=train.item_cnt_day)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0xb5f8160>"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAj8AAAEFCAYAAAAMpYZhAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAADYxJREFUeJzt3X+w5XVdx/HX3V1+7KU1lnGnwcSYwelDGEEoUZMZk4apA+tgA47ANDDmWE7qVJaSP9CsGZIYJZPIBJzEWBN0q0nZHCPtF6JJ5uB+KGt0aKT4KT8u47Ls7Y9zLlzoXu45u+fcde/78Zhh5p7v+Xz3fs5nvux57vd8zzkz8/PzAQCoYt3+ngAAwGoSPwBAKeIHAChF/AAApYgfAKCUDaMO3L370fl7752b5lxK2Lx5NtZxMqzlZFjHybCOk2MtJ6P6Om7ZsmlmuftGPvOzYcP6ycymOOs4OdZyMqzjZFjHybGWk2Edl+dlLwCgFPEDAJQifgCAUsQPAFCK+AEAShE/AEAp4gcAKEX8AACliB8AoBTxAwCUIn4AgFLEDwBQivgBAEoRPwBAKeIHAChF/AAApYgfAKAU8QMAlCJ+AIBSNow68Oyzz86ePfN55jOPyoUXXjTFKQEATM/I8TM3N5ckuffee6Y2GQCAaRs5fpKZ6c0CAGCVuOYHAChF/AAApYgfAKAU8QMAlCJ+AIBSxA8AUIr4AQBKET8AQCniBwAoRfwAAKWIHwCgFPEDAJQifgCAUsQPAFCK+AEAShE/AEAp4gcAKEX8AACliB8AoBTxAwCUIn4AgFLEDwBQivgBAEoRPwBAKeIHAChF/AAApYgfAKAU8QMAlCJ+AIBSxA8AUIr4AQBKET8AQCniBwAoRfwAAKWIHwCgFPEDAJQifgCAUsQPAFCK+AEAShE/AEAp4gcAKGWM+JlPkjz00EP52MeumdJ0AACma+wzP7t2fSc333zTNOYCADB1XvYCAEoRPwBAKeIHAChF/AAApYgfAKAU8QMAlCJ+AIBSxA8AUIr4AQBKET8AQCniBwAoRfwAAKWIHwCgFPEDAJQifgCAUsQPAFCK+AEAShE/AEAp4gcAKEX8AACliB8AoBTxAwCUIn4AgFLEDwBQivgBAEoRPwBAKeIHAChF/AAApYgfAKAU8QMAlCJ+AIBSxA8AUIr4AQBKET8AQCniBwAoRfwAAKWIHwCgFPEDAJSyYdwd9uzZk7vvvisXXPCqacxnv5idnc0ZZ5yZa6/9SJLkxBNPeuy+Y489Lkly2mkvTZLs2PHXSZKdO299wp/x+tf/+oq/Z+fOW3PHHbO57765J/zZo1j4fePsAwCjqvQ8M3b8rEVzc3PZvv36x27fcsu/PPZz7zuTPB4/C+Mefnhu7N+zfft1Oeig9XnkkUeTjHeAbd9+3dj7AMCoKj3PjBk/89OZxXeB5WJmYfvCGZ/lxl122SVPefZn585b0/vX/t+2UQ6yxfuOug8AjKra84xrfka0ffv1Tzg79GSLzxYtvf91I21bad9R9wGAUVV7nhE/AEAp4mdEW7eema1bz1z2/sUXSS+9/ytG2rbSvqPuAwCjqvY8M+Y1PzNZq9f9bNw4u+T1PBs3ziZZ+YLnld7tdeyxx6W1H9qrC54X9h1nHwAYVbXnGe/2ymhvdV+wcPbnyW91H8XWra/I4Yc//lb3cfcFgGmp9DwzMz8/2pmc008/fT6Zybp1M9m8+Yi85z2XTXlqa9OWLZty550P7O9prAnWcjKs42RYx8mxlpNRfR23bNk0s9x9rvkBAEoRPwBAKeIHAChF/AAApYgfAKAU8QMAlCJ+AIBSxA8AUIr4AQBKET8AQCniBwAoRfwAAKWIHwCgFPEDAJQifgCAUsQPAFCK+AEAShE/AEAp4gcAKEX8AACliB8AoBTxAwCUIn4AgFLEDwBQivgBAEoRPwBAKeIHAChF/AAApYgfAKAU8QMAlCJ+AIBSxA8AUIr4AQBKET8AQCniBwAoRfwAAKWIHwCgFPEDAJQifgCAUsaOn4MPPiQnn3zKNOYCADB1Y8TPTJLksMMOy1lnnTOl6QAATJeXvQCAUsQPAFCK+AEAShE/AEAp4gcAKEX8AACliB8AoBTxAwCUIn4AgFLEDwBQivgBAEoRPwBAKeIHAChF/AAApYgfAKAU8QMAlCJ+AIBSxA8AUIr4AQBKET8AQCniBwAoRfwAAKWIHwCgFPEDAJQifgCAUsQPAFCK+AEAShE/AEAp4gcAKEX8AACliB8AoBTxAwCUIn4AgFLEDwBQivgBAEoRPwBAKeIHAChF/AAApYgfAKAU8QMAlLJh9KHz05sFAMAqGTl+Zmdns2fPfDZvPmKa8wEAmKqR42fbtm25884HpjkXAICpc80PAFCK+AEAShE/AEAp4gcAKEX8AACliB8AoBTxAwCUIn4AgFLEDwBQivgBAEoRPwBAKeIHAChF/AAApYgfAKAU8QMAlCJ+AIBSxA8AUIr4AQBKET8AQCniBwAoZWZ+fn5/zwEAYNU48wMAlCJ+AIBSxA8AUIr4AQBKET8AQCniBwAoRfwAAKVsWGlAa21dkg8kOSHJd5K8uvf+H9Oe2IGitfblJN8e3vyvJFckeV+S3Ul29N7fudwattZ+fNSxq/qgVlFr7ZQkF/feT22tPTvJ1Unmk3w1yet673taa+9I8rIM1umNvfcvTGLsaj7O1fCktTwpyV8m+ffh3Zf33rdZy+W11g5KcmWSo5MckuTdSW6NY3Isy6zj7XE8jq21tj7JB5O0JI8mOT/JTByT+2yUMz8vT3Jo7/0nkrw5ye9Pd0oHjtbaoUnSez91+N/5Sf4oyauSPD/JKcMnoeXWcJyxa05r7TeS/EmSQ4ebLk3y1t77T2XwP/jW4Zr8dJJTkrwyyR9OYuy0H9tqW2ItT0py6aJjc5u1XNG5Se4ePraXJHl/HJN7Y6l1dDzundOTpPf+k0nensHjdUxOwCjx8/wkn06S3vs/J3neVGd0YDkhyWxrbUdr7bOttRckOaT3/vXe+3ySG5K8MEusYWvtaaOOXfVHtXq+nuTMRbefm+Tvhj9/KsmLMliPHb33+d77N5NsaK1tmcDYtWaptXxZa+1zrbUPtdY2xVqu5M+TvG3R7d1xTO6N5dbR8Tim3vsnk7xmePMHkvxPHJMTMUr8PC2Pv6yTJI+21lZ8uayIuSSXJHlxktcmuWq4bcEDSb43S6zhcNv9o4xdq+vde78uySOLNs0MQzBZfj0Wtu/r2DVlibX8QpI39d5fkOQ/k7wj1vIp9d4f7L0/MHxi/niSt8YxObZl1tHxuJd677tbax9O8gcZrKdjcgJGiZ/7k2xavE/vffeU5nOguS3JR4YFfVsGB9QRi+7flOS+LLGGS2xbdmyh9V78+vJy67GwfV/HrnWf6L1/aeHnJD8aa7mi1tpRSf42yZ/23j8ax+ReWWIdHY/7oPf+C0l+MIPrfzYuussxuZdGiZ9/SPLSJBleoPtvU53RgeWCDK/Jaa09I8lskodaa8e01mYyOCP0+Syxhr33+5PsGmXs6j6k/erLrbVThz+/JI+vx4tba+taa8/KIAbvmsDYte6G1tqPDX9+YZIvxVo+pdba9yXZkeQ3e+9XDjc7Jse0zDo6HvdCa+281tpbhjfnMgiULzom990oL6d8IsnPttb+MYOLoM6f7pQOKB9KcnVr7e8zuEL+ggwOzmuSrM/gddWbWms3Z+k1fO0YYyv4tSQfbK0dnORrST7ee3+0tfb5JP+UQay/bhJjV+0R7T+/lOT9rbVdSe5I8pre+/3W8ildmGRzkre11hauWXlDkssck2NZah1/Ncl7HY9juz7JVa21zyU5KMkbM3ic/p7cRzPz8/MrjwIAWCN8yCEAUIr4AQBKET8AQCniBwAoRfwAAKWsyU8OBiajtfa8DD6S4aYkD/be/+y7YT6991fvz3kABzZvdQdW1Fq7OsmNvfer9/NUAPaZ+AGWNfzE13cnOS7Jg0l+McktSa5IclQGH+r5lt77Z1prFyV5VgYfw78lye9k8Gm+pyT51ySvXPTdQU/+PUcn+YskO5M8J8k3kpzbe7+ntXZnki8mOTLJm5L8Vu/91NbaicN5zCa5J8k5vffbW2tvTnJWBh8eekMGnzTsLzrgMa75AVayO4MweXvv/YYk70tyZe/9uUnOSHLF8Essk+T4JKdm8E3UVyW5OMkPJzkpyY+s8HuOT/KB3vtzMviE2YuG25+e5OLe+4l54pe3XpPkt3vvxye5NskbWms/l8G3U5+cwfdHfX+Sc/buYQNrlWt+gHG9KMmxrbV3DW8flOSY4c9/M/wW6m8k+Vbv/dYkaa39dwZfefBUbuu93zj8+cNJPrrovpsWD2ytPT3Jkb33v0qS3vvlw+2XZHCmaeFLNDcm+eZ4Dw9Y68QPMK71SX6m935PkrTWjkzyv0lenmTXonG7x/xzF49ft/h27/3hJ419JIPv08twDocmecZwbu/tvV863H74XswDWOO87AWMYnce/8fSZ5P8cpK01o5L8tUMrrvZV214HU8y+ELfTy03sPf+7SS3t9ZOG246L8m7hnM7r7X2Pa21DUk+meTnJzA3YA1x5gcYxWeS/G5r7b4kv5Lkj1trX0kyk8GFyQ+01vb1d9yT5J2ttWcn+UqSld7Ofm6Sy1trv5fkriTn9d6/1Vo7IYOXydYn+XQGL6EBPMa7vYD9bvhurxt770fv56kABTjzA6yK1toxSa5b5m4fWgisGmd+AIBSXPAMAJQifgCAUsQPAFCK+AEAShE/AEAp/wfCaJwnoNOKTgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 720x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.figure(figsize=(10,4))\n",
    "plt.xlim(train.item_price.min(), train.item_price.max()*1.1)\n",
    "sns.boxplot(x=train.item_price)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = train[train.item_price<100000]\n",
    "train = train[train.item_cnt_day<1001]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "median = train[(train.shop_id==32)&(train.item_id==2973)&(train.date_block_num==4)&(train.item_price>0)].item_price.median()\n",
    "train.loc[train.item_price<0, 'item_price'] = median"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Якутск Орджоникидзе, 56\n",
    "train.loc[train.shop_id == 0, 'shop_id'] = 57\n",
    "test.loc[test.shop_id == 0, 'shop_id'] = 57\n",
    "# Якутск ТЦ \"Центральный\"\n",
    "train.loc[train.shop_id == 1, 'shop_id'] = 58\n",
    "test.loc[test.shop_id == 1, 'shop_id'] = 58\n",
    "# Жуковский ул. Чкалова 39м²\n",
    "train.loc[train.shop_id == 10, 'shop_id'] = 11\n",
    "test.loc[test.shop_id == 10, 'shop_id'] = 11"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "shops.loc[shops.shop_name == 'Сергиев Посад ТЦ \"7Я\"', 'shop_name'] = 'СергиевПосад ТЦ \"7Я\"'\n",
    "shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])\n",
    "shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'\n",
    "shops['city_code'] = LabelEncoder().fit_transform(shops['city'])\n",
    "shops = shops[['shop_id','city_code']]\n",
    "\n",
    "cats['split'] = cats['item_category_name'].str.split('-')\n",
    "cats['type'] = cats['split'].map(lambda x: x[0].strip())\n",
    "cats['type_code'] = LabelEncoder().fit_transform(cats['type'])\n",
    "# if subtype is nan then type\n",
    "cats['subtype'] = cats['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())\n",
    "cats['subtype_code'] = LabelEncoder().fit_transform(cats['subtype'])\n",
    "cats = cats[['item_category_id','type_code', 'subtype_code']]\n",
    "\n",
    "items.drop(['item_name'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(363, 5100, 42, 214200)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(list(set(test.item_id) - set(test.item_id).intersection(set(train.item_id)))), len(list(set(test.item_id))), len(list(set(test.shop_id))),len(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "42"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(list(set(test.shop_id)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11.833183288574219"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ts = time.time()\n",
    "matrix = []\n",
    "cols = ['date_block_num','shop_id','item_id']\n",
    "for i in range(34):\n",
    "    sales = train[train.date_block_num==i]\n",
    "    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))\n",
    "    \n",
    "matrix = pd.DataFrame(np.vstack(matrix), columns=cols)\n",
    "matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)\n",
    "matrix['shop_id'] = matrix['shop_id'].astype(np.int8)\n",
    "matrix['item_id'] = matrix['item_id'].astype(np.int16)\n",
    "matrix.sort_values(cols,inplace=True)\n",
    "time.time() - ts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['revenue'] = train['item_price'] *  train['item_cnt_day']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5.752575159072876"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ts = time.time()\n",
    "group = train.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day': ['sum']})\n",
    "group.columns = ['item_cnt_month']\n",
    "group.reset_index(inplace=True)\n",
    "\n",
    "matrix = pd.merge(matrix, group, on=cols, how='left')\n",
    "matrix['item_cnt_month'] = (matrix['item_cnt_month']\n",
    "                                .fillna(0)\n",
    "                                .clip(0,20) # NB clip target here\n",
    "                                .astype(np.float16))\n",
    "time.time() - ts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date_block_num</th>\n",
       "      <th>shop_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>item_cnt_month</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>27</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>33</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>317</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>438</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>471</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   date_block_num  shop_id  item_id  item_cnt_month\n",
       "0               0        2       27             1.0\n",
       "1               0        2       33             1.0\n",
       "2               0        2      317             1.0\n",
       "3               0        2      438             1.0\n",
       "4               0        2      471             2.0"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "group.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date_block_num</th>\n",
       "      <th>shop_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>item_cnt_month</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>19</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>27</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>28</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>29</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>32</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   date_block_num  shop_id  item_id  item_cnt_month\n",
       "0               0        2       19             0.0\n",
       "1               0        2       27             1.0\n",
       "2               0        2       28             0.0\n",
       "3               0        2       29             0.0\n",
       "4               0        2       32             0.0"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "matrix.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "test['date_block_num'] = 34\n",
    "test['date_block_num'] = test['date_block_num'].astype(np.int8)\n",
    "test['shop_id'] = test['shop_id'].astype(np.int8)\n",
    "test['item_id'] = test['item_id'].astype(np.int16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.1070106029510498"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ts = time.time()\n",
    "matrix = pd.concat([matrix, test], ignore_index=True, sort=False, keys=cols)\n",
    "matrix.fillna(0, inplace=True) # 34 month\n",
    "time.time() - ts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6.388638734817505"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ts = time.time()\n",
    "matrix = pd.merge(matrix, shops, on=['shop_id'], how='left')\n",
    "matrix = pd.merge(matrix, items, on=['item_id'], how='left')\n",
    "matrix = pd.merge(matrix, cats, on=['item_category_id'], how='left')\n",
    "matrix['city_code'] = matrix['city_code'].astype(np.int8)\n",
    "matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)\n",
    "matrix['type_code'] = matrix['type_code'].astype(np.int8)\n",
    "matrix['subtype_code'] = matrix['subtype_code'].astype(np.int8)\n",
    "time.time() - ts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "def lag_feature(df, lags, col):\n",
    "    tmp = df[['date_block_num','shop_id','item_id',col]]\n",
    "    for i in lags:\n",
    "        shifted = tmp.copy()\n",
    "        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]\n",
    "        shifted['date_block_num'] += i\n",
    "        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "47.04199981689453"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ts = time.time()\n",
    "matrix = lag_feature(matrix, [1,2,3,6,12], 'item_cnt_month')\n",
    "time.time() - ts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date_block_num</th>\n",
       "      <th>shop_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>item_cnt_month</th>\n",
       "      <th>city_code</th>\n",
       "      <th>item_category_id</th>\n",
       "      <th>type_code</th>\n",
       "      <th>subtype_code</th>\n",
       "      <th>item_cnt_month_lag_1_x</th>\n",
       "      <th>item_cnt_month_lag_2_x</th>\n",
       "      <th>item_cnt_month_lag_3_x</th>\n",
       "      <th>item_cnt_month_lag_6_x</th>\n",
       "      <th>item_cnt_month_lag_12_x</th>\n",
       "      <th>item_cnt_month_lag_1_y</th>\n",
       "      <th>item_cnt_month_lag_2_y</th>\n",
       "      <th>item_cnt_month_lag_3_y</th>\n",
       "      <th>item_cnt_month_lag_6_y</th>\n",
       "      <th>item_cnt_month_lag_12_y</th>\n",
       "      <th>item_cnt_month_lag_1</th>\n",
       "      <th>item_cnt_month_lag_2</th>\n",
       "      <th>item_cnt_month_lag_3</th>\n",
       "      <th>item_cnt_month_lag_6</th>\n",
       "      <th>item_cnt_month_lag_12</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>19</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>11</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>27</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>19</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>28</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>30</td>\n",
       "      <td>8</td>\n",
       "      <td>55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>29</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>23</td>\n",
       "      <td>5</td>\n",
       "      <td>16</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>32</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>11</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   date_block_num  shop_id  item_id  item_cnt_month  city_code  \\\n",
       "0               0        2       19             0.0          0   \n",
       "1               0        2       27             1.0          0   \n",
       "2               0        2       28             0.0          0   \n",
       "3               0        2       29             0.0          0   \n",
       "4               0        2       32             0.0          0   \n",
       "\n",
       "   item_category_id  type_code  subtype_code  item_cnt_month_lag_1_x  \\\n",
       "0                40         11             4                     NaN   \n",
       "1                19          5            10                     NaN   \n",
       "2                30          8            55                     NaN   \n",
       "3                23          5            16                     NaN   \n",
       "4                40         11             4                     NaN   \n",
       "\n",
       "   item_cnt_month_lag_2_x  item_cnt_month_lag_3_x  item_cnt_month_lag_6_x  \\\n",
       "0                     NaN                     NaN                     NaN   \n",
       "1                     NaN                     NaN                     NaN   \n",
       "2                     NaN                     NaN                     NaN   \n",
       "3                     NaN                     NaN                     NaN   \n",
       "4                     NaN                     NaN                     NaN   \n",
       "\n",
       "   item_cnt_month_lag_12_x  item_cnt_month_lag_1_y  item_cnt_month_lag_2_y  \\\n",
       "0                      NaN                     NaN                     NaN   \n",
       "1                      NaN                     NaN                     NaN   \n",
       "2                      NaN                     NaN                     NaN   \n",
       "3                      NaN                     NaN                     NaN   \n",
       "4                      NaN                     NaN                     NaN   \n",
       "\n",
       "   item_cnt_month_lag_3_y  item_cnt_month_lag_6_y  item_cnt_month_lag_12_y  \\\n",
       "0                     NaN                     NaN                      NaN   \n",
       "1                     NaN                     NaN                      NaN   \n",
       "2                     NaN                     NaN                      NaN   \n",
       "3                     NaN                     NaN                      NaN   \n",
       "4                     NaN                     NaN                      NaN   \n",
       "\n",
       "   item_cnt_month_lag_1  item_cnt_month_lag_2  item_cnt_month_lag_3  \\\n",
       "0                   NaN                   NaN                   NaN   \n",
       "1                   NaN                   NaN                   NaN   \n",
       "2                   NaN                   NaN                   NaN   \n",
       "3                   NaN                   NaN                   NaN   \n",
       "4                   NaN                   NaN                   NaN   \n",
       "\n",
       "   item_cnt_month_lag_6  item_cnt_month_lag_12  \n",
       "0                   NaN                    NaN  \n",
       "1                   NaN                    NaN  \n",
       "2                   NaN                    NaN  \n",
       "3                   NaN                    NaN  \n",
       "4                   NaN                    NaN  "
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "matrix.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17.70300006866455"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ts = time.time()\n",
    "group = matrix.groupby(['date_block_num']).agg({'item_cnt_month': ['mean']})\n",
    "group.columns = [ 'date_avg_item_cnt' ]\n",
    "group.reset_index(inplace=True)\n",
    "\n",
    "matrix = pd.merge(matrix, group, on=['date_block_num'], how='left')\n",
    "matrix['date_avg_item_cnt'] = matrix['date_avg_item_cnt'].astype(np.float16)\n",
    "matrix = lag_feature(matrix, [1], 'date_avg_item_cnt')\n",
    "matrix.drop(['date_avg_item_cnt'], axis=1, inplace=True)\n",
    "time.time() - ts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date_block_num</th>\n",
       "      <th>shop_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>item_cnt_month</th>\n",
       "      <th>city_code</th>\n",
       "      <th>item_category_id</th>\n",
       "      <th>type_code</th>\n",
       "      <th>subtype_code</th>\n",
       "      <th>item_cnt_month_lag_1_x</th>\n",
       "      <th>item_cnt_month_lag_2_x</th>\n",
       "      <th>item_cnt_month_lag_3_x</th>\n",
       "      <th>item_cnt_month_lag_6_x</th>\n",
       "      <th>item_cnt_month_lag_12_x</th>\n",
       "      <th>item_cnt_month_lag_1_y</th>\n",
       "      <th>item_cnt_month_lag_2_y</th>\n",
       "      <th>item_cnt_month_lag_3_y</th>\n",
       "      <th>item_cnt_month_lag_6_y</th>\n",
       "      <th>item_cnt_month_lag_12_y</th>\n",
       "      <th>item_cnt_month_lag_1_x</th>\n",
       "      <th>item_cnt_month_lag_2_x</th>\n",
       "      <th>item_cnt_month_lag_3_x</th>\n",
       "      <th>item_cnt_month_lag_6_x</th>\n",
       "      <th>item_cnt_month_lag_12_x</th>\n",
       "      <th>item_cnt_month_lag_1_y</th>\n",
       "      <th>item_cnt_month_lag_2_y</th>\n",
       "      <th>item_cnt_month_lag_3_y</th>\n",
       "      <th>item_cnt_month_lag_6_y</th>\n",
       "      <th>item_cnt_month_lag_12_y</th>\n",
       "      <th>date_avg_item_cnt_lag_1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>19</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>11</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>27</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>19</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>28</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>30</td>\n",
       "      <td>8</td>\n",
       "      <td>55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>29</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>23</td>\n",
       "      <td>5</td>\n",
       "      <td>16</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>32</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>11</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   date_block_num  shop_id  item_id  item_cnt_month  city_code  \\\n",
       "0               0        2       19             0.0          0   \n",
       "1               0        2       27             1.0          0   \n",
       "2               0        2       28             0.0          0   \n",
       "3               0        2       29             0.0          0   \n",
       "4               0        2       32             0.0          0   \n",
       "\n",
       "   item_category_id  type_code  subtype_code  item_cnt_month_lag_1_x  \\\n",
       "0                40         11             4                     NaN   \n",
       "1                19          5            10                     NaN   \n",
       "2                30          8            55                     NaN   \n",
       "3                23          5            16                     NaN   \n",
       "4                40         11             4                     NaN   \n",
       "\n",
       "   item_cnt_month_lag_2_x  item_cnt_month_lag_3_x  item_cnt_month_lag_6_x  \\\n",
       "0                     NaN                     NaN                     NaN   \n",
       "1                     NaN                     NaN                     NaN   \n",
       "2                     NaN                     NaN                     NaN   \n",
       "3                     NaN                     NaN                     NaN   \n",
       "4                     NaN                     NaN                     NaN   \n",
       "\n",
       "   item_cnt_month_lag_12_x  item_cnt_month_lag_1_y  item_cnt_month_lag_2_y  \\\n",
       "0                      NaN                     NaN                     NaN   \n",
       "1                      NaN                     NaN                     NaN   \n",
       "2                      NaN                     NaN                     NaN   \n",
       "3                      NaN                     NaN                     NaN   \n",
       "4                      NaN                     NaN                     NaN   \n",
       "\n",
       "   item_cnt_month_lag_3_y  item_cnt_month_lag_6_y  item_cnt_month_lag_12_y  \\\n",
       "0                     NaN                     NaN                      NaN   \n",
       "1                     NaN                     NaN                      NaN   \n",
       "2                     NaN                     NaN                      NaN   \n",
       "3                     NaN                     NaN                      NaN   \n",
       "4                     NaN                     NaN                      NaN   \n",
       "\n",
       "   item_cnt_month_lag_1_x  item_cnt_month_lag_2_x  item_cnt_month_lag_3_x  \\\n",
       "0                     NaN                     NaN                     NaN   \n",
       "1                     NaN                     NaN                     NaN   \n",
       "2                     NaN                     NaN                     NaN   \n",
       "3                     NaN                     NaN                     NaN   \n",
       "4                     NaN                     NaN                     NaN   \n",
       "\n",
       "   item_cnt_month_lag_6_x  item_cnt_month_lag_12_x  item_cnt_month_lag_1_y  \\\n",
       "0                     NaN                      NaN                     NaN   \n",
       "1                     NaN                      NaN                     NaN   \n",
       "2                     NaN                      NaN                     NaN   \n",
       "3                     NaN                      NaN                     NaN   \n",
       "4                     NaN                      NaN                     NaN   \n",
       "\n",
       "   item_cnt_month_lag_2_y  item_cnt_month_lag_3_y  item_cnt_month_lag_6_y  \\\n",
       "0                     NaN                     NaN                     NaN   \n",
       "1                     NaN                     NaN                     NaN   \n",
       "2                     NaN                     NaN                     NaN   \n",
       "3                     NaN                     NaN                     NaN   \n",
       "4                     NaN                     NaN                     NaN   \n",
       "\n",
       "   item_cnt_month_lag_12_y  date_avg_item_cnt_lag_1  \n",
       "0                      NaN                      NaN  \n",
       "1                      NaN                      NaN  \n",
       "2                      NaN                      NaN  \n",
       "3                      NaN                      NaN  \n",
       "4                      NaN                      NaN  "
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "matrix.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'city_code',\n",
       "       'item_category_id', 'type_code', 'subtype_code',\n",
       "       'item_cnt_month_lag_1_x', 'item_cnt_month_lag_2_x',\n",
       "       'item_cnt_month_lag_3_x', 'item_cnt_month_lag_6_x',\n",
       "       'item_cnt_month_lag_12_x', 'item_cnt_month_lag_1_y',\n",
       "       'item_cnt_month_lag_2_y', 'item_cnt_month_lag_3_y',\n",
       "       'item_cnt_month_lag_6_y', 'item_cnt_month_lag_12_y',\n",
       "       'item_cnt_month_lag_1_x', 'item_cnt_month_lag_2_x',\n",
       "       'item_cnt_month_lag_3_x', 'item_cnt_month_lag_6_x',\n",
       "       'item_cnt_month_lag_12_x', 'item_cnt_month_lag_1_y',\n",
       "       'item_cnt_month_lag_2_y', 'item_cnt_month_lag_3_y',\n",
       "       'item_cnt_month_lag_6_y', 'item_cnt_month_lag_12_y',\n",
       "       'date_avg_item_cnt_lag_1'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "matrix.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 11128004 entries, 0 to 11128003\n",
      "Data columns (total 29 columns):\n",
      "date_block_num             int8\n",
      "shop_id                    int8\n",
      "item_id                    int16\n",
      "item_cnt_month             float16\n",
      "city_code                  int8\n",
      "item_category_id           int8\n",
      "type_code                  int8\n",
      "subtype_code               int8\n",
      "item_cnt_month_lag_1_x     float16\n",
      "item_cnt_month_lag_2_x     float16\n",
      "item_cnt_month_lag_3_x     float16\n",
      "item_cnt_month_lag_6_x     float16\n",
      "item_cnt_month_lag_12_x    float16\n",
      "item_cnt_month_lag_1_y     float16\n",
      "item_cnt_month_lag_2_y     float16\n",
      "item_cnt_month_lag_3_y     float16\n",
      "item_cnt_month_lag_6_y     float16\n",
      "item_cnt_month_lag_12_y    float16\n",
      "item_cnt_month_lag_1_x     float16\n",
      "item_cnt_month_lag_2_x     float16\n",
      "item_cnt_month_lag_3_x     float16\n",
      "item_cnt_month_lag_6_x     float16\n",
      "item_cnt_month_lag_12_x    float16\n",
      "item_cnt_month_lag_1_y     float16\n",
      "item_cnt_month_lag_2_y     float16\n",
      "item_cnt_month_lag_3_y     float16\n",
      "item_cnt_month_lag_6_y     float16\n",
      "item_cnt_month_lag_12_y    float16\n",
      "date_avg_item_cnt_lag_1    float16\n",
      "dtypes: float16(22), int16(1), int8(6)\n",
      "memory usage: 636.7 MB\n"
     ]
    }
   ],
   "source": [
    "matrix.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
